5  Random Forest Model

5.0.1 Overview

  1. import packages
  2. load data and define variables
  3. train model and make prediction for testing period
    • input to model
      • lags = previous x time steps to predict the next step
      • n_estimators: number of trees
  4. calculate MSE for predicted and testing data
  5. save data (predicted NDVI and MSE) to netCDF file
import xarray as xr
import numpy as np
import pandas as pd
from darts import TimeSeries
from darts.models import RandomForest
from sklearn.metrics import mean_squared_error
import netCDF4 as nc
import matplotlib.pyplot as plt
# Function to load data from NetCDF file and defining variables
def load_nc_file(file_path):
    """Open a NetCDF file and return its NDVI cube and coordinate variables.

    Args:
        file_path: Path of the NetCDF file to open with xarray.

    Returns:
        A 4-tuple ``(ndvi, times, x, y)`` holding the dataset's ``'NDVI'``,
        ``'time'``, ``'x'`` and ``'y'`` variables, in that order.

    The dataset handle is not closed here; the returned variables are taken
    directly from it.
    """
    dataset = xr.open_dataset(file_path)
    return tuple(dataset[name] for name in ('NDVI', 'time', 'x', 'y'))

# turn data into darts TimeSeries
def prepare_darts_timeseries(ndvi_data, times):
    """Build one darts ``TimeSeries`` per pixel of an NDVI cube.

    Args:
        ndvi_data: Array indexed as ``[time, x, y]`` (the first axis is paired
            with ``times``; axes 1 and 2 are iterated as x and y).
        times: Time stamps for the first axis of ``ndvi_data``; passed to
            ``pd.to_datetime(..., unit='s')``.

    Returns:
        A list of ``TimeSeries`` in x-major order (outer loop over axis 1,
        inner loop over axis 2), i.e. ``len(result) == shape[1] * shape[2]``.

    NaN values are replaced by 0.0 (assuming only pixels that are entirely
    NaN exist). Note that ``np.nan_to_num`` also maps +/-inf to large finite
    numbers, so every value in the resulting series is finite.
    """
    # The time index is identical for every pixel, so build it once instead of
    # once per pixel.
    time_index = pd.to_datetime(times, unit='s')
    # Clean the whole cube in a single pass; slicing the plain ndarray per
    # pixel afterwards is far cheaper than cleaning each slice separately.
    cube = np.nan_to_num(ndvi_data, nan=0.0)

    series_list = []
    for i in range(cube.shape[1]):  # iterate over x dimension
        for j in range(cube.shape[2]):  # iterate over y dimension
            series_list.append(
                TimeSeries.from_times_and_values(time_index, cube[:, i, j])
            )
    return series_list

# train a random forest per pixel and forecast the testing period
def prediction_series(train_ndvi_data, train_times, test_times,
                      lags=25, n_estimators=100):
    """Fit a darts ``RandomForest`` on every pixel and forecast the test period.

    Args:
        train_ndvi_data: Training cube indexed as ``[time, x, y]``.
        train_times: Time stamps of the training cube's first axis; passed to
            ``pd.to_datetime(..., unit='s')``.
        test_times: Time stamps of the testing period; only its length is
            used, as the number of steps to forecast.
        lags: Number of previous time steps used to predict the next step.
        n_estimators: Number of trees in the forest.

    Returns:
        A list with one forecast ``TimeSeries`` per pixel, in x-major order
        (outer loop over axis 1, inner loop over axis 2).

    NaN values are replaced by 0.0 before training (assuming they only exist
    in pixels that are completely NaN).
    """
    # One model object is created and re-fitted for every pixel.
    model = RandomForest(
            lags=lags,
            n_estimators=n_estimators)

    # Loop-invariant values: compute them once rather than once per pixel.
    time_index = pd.to_datetime(train_times, unit='s')
    horizon = len(test_times)
    cube = np.nan_to_num(train_ndvi_data, nan=0.0)
    n_x, n_y = cube.shape[1], cube.shape[2]

    pred_series = []
    for i in range(n_x):  # iterate over x dimension
        # progress indicator, overwritten in place via the carriage return
        print(f'{i}/{n_x}', end='\r')
        for j in range(n_y):  # iterate over y dimension
            series = TimeSeries.from_times_and_values(time_index, cube[:, i, j])
            # train model on this pixel's training series
            model.fit(series)
            # forecast as many steps as the testing period contains
            pred_series.append(model.predict(n=horizon))
    return pred_series

# Save predictions and MSE to a new NetCDF file
def save_to_nc_file(output_file, pred_data, mse_data, times, x, y):
    """Write per-pixel predictions and MSE values to a new NetCDF4 file.

    Args:
        output_file: Path of the NetCDF file to create (opened in ``'w'`` mode).
        pred_data: One forecast series per pixel, in x-major order (x outer,
            y inner); each must expose ``.values()`` with ``len(times)``
            entries.
        mse_data: One MSE value per pixel, in the same x-major order.
        times: Time coordinate of the predictions.
        x: x coordinate values.
        y: y coordinate values.

    The file gets the dimensions ``time``, ``x``, ``y`` and the float32
    variables ``time``, ``x``, ``y``, ``pred_ndvi`` (time, x, y) and
    ``mse`` (x, y).
    """
    n_time, n_x, n_y = len(times), len(x), len(y)

    # Stacking the per-pixel forecasts gives shape (n_x * n_y, n_time). The
    # pixel axis has to be unfolded to (x, y) first and only then can the time
    # axis be moved to the front; reshaping directly to (time, x, y) would
    # mix values of different pixels and time steps.
    stacked = np.array([pred.values().flatten() for pred in pred_data])
    pred_ndvi = np.moveaxis(stacked.reshape((n_x, n_y, n_time)), -1, 0)

    # Use the mse_data argument, not a module-level variable of the caller.
    mse_grid = np.array(mse_data).reshape((n_x, n_y))

    with nc.Dataset(output_file, 'w', format='NETCDF4') as ds:
        ds.createDimension('time', n_time)
        ds.createDimension('x', n_x)
        ds.createDimension('y', n_y)

        time_var = ds.createVariable('time', 'f4', ('time',))
        x_var = ds.createVariable('x', 'f4', ('x',))
        y_var = ds.createVariable('y', 'f4', ('y',))
        pred_var = ds.createVariable('pred_ndvi', 'f4', ('time', 'x', 'y'))
        mse_var = ds.createVariable('mse', 'f4', ('x', 'y'))

        time_var[:] = times
        x_var[:] = x
        y_var[:] = y

        pred_var[:] = pred_ndvi
        mse_var[:] = mse_grid
# Load training and testing data
# NOTE(review): train_path and test_path are not defined in this part of the
# file; they are concatenated directly with the file names, so they presumably
# are string prefixes ending in a path separator -- confirm.
train_ndvi, train_times, train_x, train_y = load_nc_file(train_path+'ds_B_Cube_665_train.nc')
test_ndvi, test_times, test_x, test_y = load_nc_file(test_path+'Cube_665_test.nc')
# make prediction: one random forest fit per pixel of the training cube,
# forecasting as many steps as there are test time stamps
pred_series = prediction_series(train_ndvi, train_times, test_times)
print("done with prediction")
done with prediction
# Convert the testing cube into one darts TimeSeries per pixel
test_series = prepare_darts_timeseries(test_ndvi, test_times)

# Per-pixel mean squared error between the forecast and the testing data.
# NOTE(review): prepare_darts_timeseries runs np.nan_to_num on the test values,
# so they are all finite by the time they arrive here and the mask below does
# not exclude anything -- confirm whether NaN observations should be skipped.
mse_list = []
for pred, actual in zip(pred_series, test_series):
    observed = actual.values().flatten()
    forecast = pred.values().flatten()
    finite = np.isfinite(observed)
    mse_list.append(mean_squared_error(observed[finite], forecast[finite]))
print("done with mse")

# Write predictions and MSE values to the output NetCDF file
save_to_nc_file(
    '/home/the-patrician42/team-extra/ndvi-time-series-prediction/data/data_predictions/'
    + 'Random_Forest_Cube_665.nc',
    pred_series,
    mse_list,
    test_times,
    test_x,
    test_y,
)
print("done saving")
done with mse
done saving